package gov.ornl.stucco.stix_extractors;

import gov.ornl.stucco.utils.STIXUtils;

import java.util.List;
import java.util.ArrayList;
import java.util.Random;
import java.util.TreeSet;
import java.util.UUID;

import javax.xml.namespace.QName;
import javax.xml.datatype.DatatypeConfigurationException;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.mitre.stix.stix_1.STIXPackage;
import org.mitre.stix.stix_1.TTPsType;
import org.mitre.stix.common_1.ControlledVocabularyStringType;
import org.mitre.stix.common_1.ToolInformationType;
import org.mitre.stix.common_1.StructuredTextType;
import org.mitre.stix.common_1.IdentityType;
import org.mitre.stix.common_1.InformationSourceType;
import org.mitre.stix.common_1.RelatedTTPType;
import org.mitre.stix.ttp_1.MalwareInstanceType;
import org.mitre.stix.ttp_1.ToolsType;
import org.mitre.stix.ttp_1.TTP;
import org.mitre.stix.ttp_1.MalwareType;
import org.mitre.stix.ttp_1.BehaviorType;
import org.mitre.stix.ttp_1.ResourceType;
import org.mitre.stix.ttp_1.InfrastructureType;
import org.mitre.cybox.cybox_2.Observables;
import org.mitre.cybox.cybox_2.Observable;
import org.mitre.cybox.cybox_2.OperatorTypeEnum;
import org.mitre.cybox.cybox_2.ObservableCompositionType;
import org.mitre.cybox.cybox_2.RelatedObjectsType;
import org.mitre.cybox.common_2.Property;
import org.mitre.cybox.common_2.CustomPropertiesType;

/**
 * Builds a STIX package from the HTML of a MalwareDomainList page.
 *
 * <p>The page is expected to contain an element with class {@code ContentBox}
 * whose last {@code table} holds two header rows followed by data rows. From
 * the data rows this class collects URLs, domains, IPs, reverse lookups,
 * descriptions and registrants, and turns them into a malware TTP plus DNS,
 * IP, port and address observables.
 *
 * <p>All STIX/CybOX construction helpers ({@code initStixPackage},
 * {@code setIpObservable}, ...) are inherited from {@link STIXUtils}.
 */
public class MalwareDomainListExtractor extends STIXUtils {

	private static final Logger logger = LoggerFactory.getLogger(MalwareDomainListExtractor.class);

	/** Source label passed to every inherited STIX/CybOX helper. */
	private static final String MDL_SOURCE = "MalwareDomainList";

	/** Number of cells a well-formed data row is expected to have. */
	private static final int EXPECTED_CELLS = 8;

	/** Highest cell index read is 5 (registrant), so at least 6 cells are needed. */
	private static final int MIN_CELLS = 6;

	/** Number of leading header rows in the data table. */
	private static final int HEADER_ROWS = 2;

	private STIXPackage stixPackage;

	/**
	 * Parses {@code content} immediately and stores the resulting package.
	 *
	 * @param content HTML of a MalwareDomainList page
	 */
	public MalwareDomainListExtractor(String content) {
		stixPackage = extract(content);
	}

	/**
	 * @return the extracted package, or {@code null} when the package could not
	 *         be initialized or the page did not contain the expected table
	 */
	public STIXPackage getStixPackage() {
		return stixPackage;
	}

	/**
	 * Converts a timestamp by delegating to the inherited two-argument
	 * {@code convertTimestamp} with a fixed format and a GMT suffix.
	 *
	 * NOTE(review): the pattern uses {@code hh}; if the inherited helper follows
	 * SimpleDateFormat pattern letters, that is the 12-hour field and {@code HH}
	 * may have been intended -- confirm before relying on this method.
	 */
	private long convertTimestamp(String time) { 
		return convertTimestamp(time + " (GMT)", "yyyy/MM/dd_hh:mm (z)");
	}

	/**
	 * Parses the page and assembles the STIX package.
	 *
	 * @param content HTML of a MalwareDomainList page
	 * @return the populated package, or {@code null} if the package could not be
	 *         initialized or the expected page structure is missing
	 */
	private STIXPackage extract(String content) {
		MalwareInstanceType malware = new MalwareInstanceType();
		Observables observables = initObservables();
		TTP ttp = initTTP(MDL_SOURCE);

		STIXPackage pkg;
		try {
			pkg = initStixPackage("Malware Domain", MDL_SOURCE);
		} catch (DatatypeConfigurationException e) {
			// Without a package there is nothing to populate; previously this fell
			// through and failed later with a NullPointerException.
			logger.error("Could not initialize the STIX package", e);
			return null;
		}

		if (content == null) {
			logger.warn("No content given, nothing to extract");
			return null;
		}

		Document doc = Jsoup.parse(content);
		Element contentBox = doc.getElementsByClass("ContentBox").first();
		if (contentBox == null) {
			logger.warn("No element with class 'ContentBox' found, nothing to extract");
			return null;
		}
		logger.debug("{}", contentBox.html());

		Element table = contentBox.getElementsByTag("table").last();
		if (table == null) {
			logger.warn("No table found inside 'ContentBox', nothing to extract");
			return null;
		}
		logger.debug("Table Contents: {}", table.outerHtml());

		logColumnTitles(table);

		Elements rows = table.getElementsByTag("tr");
		// remove the two header rows (guarded so a short table cannot throw)
		for (int i = 0; i < HEADER_ROWS && !rows.isEmpty(); i++) {
			rows.remove(0);
		}
		logger.debug("Table rows remaining: {}", rows.outerHtml());

		String cells[][] = getCells(rows);

		TreeSet<String> urls = new TreeSet<String>();
		TreeSet<String> domain = new TreeSet<String>();
		TreeSet<String> ips = new TreeSet<String>();
		TreeSet<String> reverse_lookup = new TreeSet<String>();
		TreeSet<String> description = new TreeSet<String>();
		TreeSet<String> registrant = new TreeSet<String>();

		for (int i = 0; i < cells.length; i++) {
			String[] row = cells[i];
			if (row == null || row.length < MIN_CELLS) {
				// Indices 1..5 are read below; a shorter row cannot be interpreted.
				logger.warn("Skipping cells[{}]: too few columns ({})", i, (row == null) ? 0 : row.length);
				continue;
			}
			if (row.length != EXPECTED_CELLS) {
				logger.warn("Unexpected length of cells[{}]: {}", i, row.length);
			}

			String desc = "-".equals(row[4]) ? "" : row[4];

			// With no 'domain' field, the 'ip' field contains the actual url.
			boolean hasDomain = !row[1].equals("-");
			String rawUrl = hasDomain ? row[1] : row[2];
			String urlString = rawUrl;

			if (rawUrl.contains(" ")) { //sometimes, the URL field actually contains URL, space, AVName
				// split() drops trailing empty strings, so the array may have
				// fewer than two entries (e.g. "host " or " ").
				String[] parts = rawUrl.split(" ");
				if (parts.length > 1) {
					String possibleAVName = parts[1];
					//Somewhere around 2% have this, and almost all are missing their desc. field.
					logger.info("Possible AVName found: {} from url {}", possibleAVName, rawUrl);
					if (desc.length() > 0) {
						logger.info("previous description was {}", desc);
						desc = possibleAVName + " " + desc;  //Note: this is somewhere around 0.1%
					} else {
						desc = possibleAVName;
					}
				}
				urlString = (parts.length > 0) ? parts[0] : "";
			}
			urls.add(urlString);

			String host = hostOf(urlString, rawUrl);
			if (hasDomain) {
				if (host != null) {
					domain.add(host);
				}
				ips.add(row[2]);
			} else if (host != null) {
				ips.add(host);
			}

			reverse_lookup.add(row[3]);
			description.add(desc);
			registrant.add(row[5]);
		}

		/* malware */

		//name
		String malwareName = "MalwareDomainList_";
		if (domain.size() == 1) {
			malwareName += domain.first();
		} else if (ips.size() == 1) {
			malwareName += ips.first();
		} else {
			logger.warn("No meaningful name could be generated!");
			Random r = new Random();
			malwareName += r.nextInt(1000000000);//TODO any better ideas?
		}
		malware
			.withNames(new ControlledVocabularyStringType()
				.withValue(malwareName));

		// urlsUsed
		if (urls.isEmpty()) { //can have more than one
			logger.warn("Expected to find at least one url, but found none!");
		} else {
			ttp
				.withResources(new ResourceType()
					.withTools(setTools("url", urls)));
		}

		// description
		if (description.isEmpty()) { //can have more than one
			logger.warn("Expected to find at least one description, but found none!");
		} else {
			malware
				.withDescriptions(new StructuredTextType()
					.withValue(joinWithComma(description)));
		}

		// The malware instance is attached by reference, so this single call is
		// enough; the original set the identical behavior a second time later on.
		ttp
			.withBehavior(new BehaviorType()
				.withMalware(new MalwareType()
					.withMalwareInstances(malware)))
			.withInformationSource(new InformationSourceType()
				.withIdentity(new IdentityType()
					.withName("MalwareDomainList")));

		/* DNS */
		Observable dnsObservable = null;
		if (domain.size() > 1) {
			logger.warn("Expected to find zero or one domain, but found {}", domain.size());
		} else if (domain.size() == 1) {
			dnsObservable = setDNSObservable(domain.first(), MDL_SOURCE);

			if (registrant.size() == 1) {
				if (!registrant.first().equals("-")) {
					dnsObservable
						.getObject()
							.getProperties()
								.withCustomProperties(new CustomPropertiesType()
									.withProperties(setCustomProperty("Registrant", registrant.first())));
				}
			} else {
				logger.warn("Expected to find one registrant, but found {}", registrant.size());
			}

			observables
				.withObservables(dnsObservable);

			if (ips.isEmpty()) {
				//if there are no IPs, we would need to make an 'address' node to connect to.
				//HOWEVER I don't think that will ever happen.
				//TODO: if you ever see this message, revisit this issue.
				logger.warn("Did not find any IPs!  Some edges can not be created!");
			}//if there were IPs found, the edges are made below, after the address node(s) are created.
		}//else you have no DNS info, which happens sometimes

		/* reverse DNS */
		Observable reverseDnsObservable = null;
		if (reverse_lookup.size() == 1) {
			reverseDnsObservable = setDNSObservable(reverse_lookup.first(), MDL_SOURCE);
			observables
				.withObservables(reverseDnsObservable);
		} else {
			logger.warn("Expected to find one reverse_lookup, but found {}", reverse_lookup.size());
		}

		/* ip, port and address */
		List<Observable> addressObservableList = new ArrayList<Observable>();
		if (ips.isEmpty()) { //can have more than one
			logger.warn("Expected to find at least one ip, but found none!");
		} else {
			// Whether any url was seen does not change inside the loop, so the
			// port is chosen once: "80" when urls exist, "any" otherwise.
			String port = urls.isEmpty() ? "any" : "80";
			Observable portObservable = null;

			for (String ip : ips) {
				Observable ipObservable = setIpObservable(ip, MDL_SOURCE);
				observables
					.withObservables(ipObservable);

				// The shared port observable is created after the first IP
				// observable, preserving the original insertion order.
				if (portObservable == null) {
					portObservable = setPortObservable(port, MDL_SOURCE);
					observables
						.withObservables(portObservable);
				}

				Observable addressObservable = setAddressObservable(ip, ipToLong(ip), ipObservable.getId(), port, portObservable.getId(), MDL_SOURCE);
				observables
					.withObservables(addressObservable);
				addressObservableList.add(new Observable()
					.withIdref(addressObservable.getId()));

				RelatedObjectsType relatedObjects = new RelatedObjectsType();
				if (dnsObservable != null) {
					relatedObjects
						.withRelatedObjects(setRelatedObject(dnsObservable.getId()));
				}
				if (reverseDnsObservable != null) {
					relatedObjects
						.withRelatedObjects(setRelatedObject(reverseDnsObservable.getId()));
				}
				if (!relatedObjects.getRelatedObjects().isEmpty()) {
					addressObservable
						.getObject()
							.withRelatedObjects(relatedObjects);
				}
			}
		}

		if (!addressObservableList.isEmpty()) {
			ResourceType resource = (ttp.getResources() == null) ? new ResourceType() : ttp.getResources();
			resource
				.withInfrastructure(new InfrastructureType()
					.withObservableCharacterization(initObservables()
						.withObservables(addressObservableList)));
			ttp
				.withResources(resource);
		}
		if (!observables.getObservables().isEmpty()) {
			pkg
				.withObservables(observables);
		}

		pkg
			.withTTPs(new TTPsType()
				.withTTPS(ttp));

		return pkg;
	}

	/**
	 * Logs the column titles of the table. The last header cell is not read;
	 * "Country" is appended in its place. Purely informational: nothing is
	 * logged beyond a warning when the header row is missing.
	 */
	private void logColumnTitles(Element table) {
		Element headers = table.getElementsByClass("tabletitle").first();
		if (headers == null) {
			logger.warn("No header row with class 'tabletitle' found");
			return;
		}
		Elements colTitlesElements = headers.getElementsByTag("td");
		List<String> colTitles = new ArrayList<String>();
		for (int i = 0; i < colTitlesElements.size() - 1; i++) {
			colTitles.add(colTitlesElements.get(i).text());
		}
		colTitles.add("Country");
		logger.info("Col titles found: {}", colTitles);
	}

	/**
	 * Resolves the host part of {@code urlString} via the inherited
	 * {@code getDomainFromURL}, treating the lookup as best effort.
	 *
	 * @param urlString url without scheme; "http://" is prepended before parsing
	 * @param rawCell   original cell text, used only in the warning message
	 * @return the host, or {@code null} if the url could not be parsed
	 */
	private String hostOf(String urlString, String rawCell) {
		try {
			return getDomainFromURL("http://" + urlString);
		} catch (java.net.URISyntaxException e) {
			logger.warn("Unparsable URL: " + rawCell, e);
		} catch (RuntimeException e) {
			// One malformed row must not abort the whole extraction.
			logger.warn("Unparsable URL: " + rawCell, e);
		}
		return null;
	}

	/** Joins the items in iteration order, separated by ", ". */
	private String joinWithComma(TreeSet<String> items) {
		StringBuilder sb = new StringBuilder();
		for (String item : items) {
			if (sb.length() > 0 || item != items.first()) {
				sb.append(", ");
			}
			sb.append(item);
		}
		return sb.toString();
	}
}
